Libraries required for this analysis

# Center every figure emitted by the knitted document.
knitr::opts_chunk$set(fig.align="center") 
# Packages attached for modelling, data wrangling and plotting.
library(rstanarm)
library(tidyverse)
library(tidybayes)
library(modelr) 
library(ggplot2)
library(magrittr)  
library(emmeans)
library(bayesplot)
library(brms)
library(gganimate)

# Make theme_light() the default theme for every ggplot drawn afterwards.
theme_set(theme_light())


# Load project helpers. Presumably this defines
# user_response_posterior_draws_plot() and user_response_diff_plot(), which
# are called below but not defined in this file -- TODO confirm.
source('helper_functions.R')

In our experiment, we used a visualization recommendation algorithm (composed of one search algorithm and one oracle algorithm) to generate visualizations for the user on one of two datasets. We then asked the user to evaluate the tool on a variety of metrics (confidence in understanding data, confidence in answer, efficiency, ease of use, utility, and overall).

Given a search algorithm (bfs or dfs), an oracle (CompassQL or dziban), and a dataset (birdstrikes or movies), we would like to predict a user’s score for a given metric. In addition, we would like to know if the choice of search algorithm and oracle has any meaningful impact on a user’s rating for these metrics.

Read in and clean data

# Rating columns collected from each participant, grouped by theme.
confidence_metrics <- c("confidence.udata", "confidence.ans")
preference_metrics <- c("efficiency", "ease.of.use", "utility", "overall")
# Every rating column, declared once so the groups cannot drift apart.
analyses <- c(confidence_metrics, preference_metrics)

user_response_data <- read.csv("split_by_participant_groups/ptask_responses.csv")

# Recode condition labels to their display capitalisation. The patterns are
# literal strings, so they are matched with fixed = TRUE rather than as regexes.
user_response_data$oracle <- gsub(
  "compassql", "CompassQL", user_response_data$oracle, fixed = TRUE
)
user_response_data$oracle <- gsub(
  "dziban", "Dziban", user_response_data$oracle, fixed = TRUE
)

user_response_data$search <- gsub(
  "bfs", "BFS", user_response_data$search, fixed = TRUE
)
user_response_data$search <- gsub(
  "dfs", "DFS", user_response_data$search, fixed = TRUE
)

# Ratings become ordered factors, as required by the cumulative models below;
# the experimental conditions become plain factors.
user_response_data[analyses] <- lapply(user_response_data[analyses], ordered)
user_response_data <- user_response_data %>%
  mutate(
    dataset = as.factor(dataset),
    oracle = as.factor(oracle),
    search = as.factor(search),
    task = as.factor(task)
  )

# Containers filled in metric by metric further down.
models <- list()

search_differences <- list()
oracle_differences <- list()
alg_differences <- list()
participant_group_differences <- list()

# Shared seed passed to model fitting and to posterior predictive draws.
seed <- 12

Analysis for user responses

Confidence in Understanding Data: Building a Model

# Cumulative-probit ordinal regression for the confidence.udata rating:
# full factorial of oracle, search, dataset and task, an additive
# participant_group term, and a varying intercept per participant.
# The fitted model is stored on disk under the path given by `file`.
models[["confidence_udata"]] <- brm(
  formula = bf(
    confidence.udata ~ oracle * search * dataset * task + participant_group +
      (1 | participant_id)
  ),
  data = user_response_data,
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),
  seed = seed,
  file = "models/confidence_udata"
)
## Compiling Stan program...
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DBOOST_NO_AUTO_PTR  -include '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/usr/local/include   -fPIC  -Wall -g -O2  -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:88:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:1: error: unknown type name 'namespace'
## namespace Eigen {
## ^
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:16: error: expected ';' after top level declarator
## namespace Eigen {
##                ^
##                ;
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:96:10: fatal error: 'complex' file not found
## #include <complex>
##          ^~~~~~~~~
## 3 errors generated.
## make: *** [foo.o] Error 1
## Start sampling

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

# Print Rhat, effective sample sizes and posterior summaries for the fit.
summary(models[["confidence_udata"]])
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: confidence.udata ~ oracle * search * dataset * task + participant_group + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 264) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 66) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.22      0.19     0.87     1.63 1.00      554      946
## 
## Population-Level Effects: 
##                                                          Estimate Est.Error
## Intercept[1]                                                -2.03      0.66
## Intercept[2]                                                -0.77      0.65
## Intercept[3]                                                 1.67      0.66
## oracleDziban                                                 0.16      0.87
## searchDFS                                                   -0.65      0.84
## datasetmovies                                               -0.09      0.89
## task2.RetrieveValue                                          0.17      0.57
## task3.Prediction                                             0.46      0.59
## task4.Exploration                                            0.74      0.60
## participant_groupstudent                                     0.47      0.35
## oracleDziban:searchDFS                                      -0.10      1.22
## oracleDziban:datasetmovies                                   0.78      1.22
## searchDFS:datasetmovies                                      0.07      1.21
## oracleDziban:task2.RetrieveValue                            -0.19      0.83
## oracleDziban:task3.Prediction                               -0.70      0.84
## oracleDziban:task4.Exploration                               0.58      0.88
## searchDFS:task2.RetrieveValue                                0.31      0.81
## searchDFS:task3.Prediction                                  -0.09      0.84
## searchDFS:task4.Exploration                                 -0.68      0.85
## datasetmovies:task2.RetrieveValue                            0.21      0.86
## datasetmovies:task3.Prediction                              -0.67      0.86
## datasetmovies:task4.Exploration                             -0.14      0.87
## oracleDziban:searchDFS:datasetmovies                         0.21      1.67
## oracleDziban:searchDFS:task2.RetrieveValue                   0.49      1.18
## oracleDziban:searchDFS:task3.Prediction                      1.76      1.21
## oracleDziban:searchDFS:task4.Exploration                     0.46      1.23
## oracleDziban:datasetmovies:task2.RetrieveValue              -1.02      1.20
## oracleDziban:datasetmovies:task3.Prediction                 -0.15      1.21
## oracleDziban:datasetmovies:task4.Exploration                -1.80      1.26
## searchDFS:datasetmovies:task2.RetrieveValue                  0.19      1.16
## searchDFS:datasetmovies:task3.Prediction                     0.63      1.17
## searchDFS:datasetmovies:task4.Exploration                    1.18      1.20
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue     0.71      1.66
## oracleDziban:searchDFS:datasetmovies:task3.Prediction       -0.80      1.72
## oracleDziban:searchDFS:datasetmovies:task4.Exploration       0.92      1.75
##                                                          l-95% CI u-95% CI Rhat
## Intercept[1]                                                -3.36    -0.77 1.01
## Intercept[2]                                                -2.06     0.49 1.01
## Intercept[3]                                                 0.45     3.02 1.01
## oracleDziban                                                -1.45     1.89 1.00
## searchDFS                                                   -2.31     1.01 1.01
## datasetmovies                                               -1.81     1.70 1.00
## task2.RetrieveValue                                         -0.93     1.29 1.01
## task3.Prediction                                            -0.67     1.56 1.00
## task4.Exploration                                           -0.41     1.87 1.01
## participant_groupstudent                                    -0.20     1.17 1.01
## oracleDziban:searchDFS                                      -2.53     2.26 1.01
## oracleDziban:datasetmovies                                  -1.61     3.19 1.01
## searchDFS:datasetmovies                                     -2.33     2.42 1.01
## oracleDziban:task2.RetrieveValue                            -1.79     1.48 1.00
## oracleDziban:task3.Prediction                               -2.31     0.92 1.00
## oracleDziban:task4.Exploration                              -1.10     2.36 1.01
## searchDFS:task2.RetrieveValue                               -1.28     1.90 1.01
## searchDFS:task3.Prediction                                  -1.75     1.52 1.00
## searchDFS:task4.Exploration                                 -2.32     0.98 1.01
## datasetmovies:task2.RetrieveValue                           -1.49     1.80 1.00
## datasetmovies:task3.Prediction                              -2.36     0.96 1.01
## datasetmovies:task4.Exploration                             -1.96     1.51 1.01
## oracleDziban:searchDFS:datasetmovies                        -3.00     3.40 1.01
## oracleDziban:searchDFS:task2.RetrieveValue                  -1.80     2.78 1.01
## oracleDziban:searchDFS:task3.Prediction                     -0.63     4.11 1.00
## oracleDziban:searchDFS:task4.Exploration                    -1.91     2.91 1.01
## oracleDziban:datasetmovies:task2.RetrieveValue              -3.38     1.34 1.00
## oracleDziban:datasetmovies:task3.Prediction                 -2.47     2.26 1.00
## oracleDziban:datasetmovies:task4.Exploration                -4.33     0.74 1.01
## searchDFS:datasetmovies:task2.RetrieveValue                 -2.02     2.45 1.01
## searchDFS:datasetmovies:task3.Prediction                    -1.64     2.90 1.01
## searchDFS:datasetmovies:task4.Exploration                   -1.21     3.53 1.01
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue    -2.50     3.92 1.01
## oracleDziban:searchDFS:datasetmovies:task3.Prediction       -4.13     2.64 1.01
## oracleDziban:searchDFS:datasetmovies:task4.Exploration      -2.45     4.35 1.01
##                                                          Bulk_ESS Tail_ESS
## Intercept[1]                                                  324      788
## Intercept[2]                                                  308      780
## Intercept[3]                                                  317      718
## oracleDziban                                                  339      591
## searchDFS                                                     336      731
## datasetmovies                                                 348      860
## task2.RetrieveValue                                           338      725
## task3.Prediction                                              341      786
## task4.Exploration                                             372      965
## participant_groupstudent                                      473     1238
## oracleDziban:searchDFS                                        361      619
## oracleDziban:datasetmovies                                    330      728
## searchDFS:datasetmovies                                       358      893
## oracleDziban:task2.RetrieveValue                              314      648
## oracleDziban:task3.Prediction                                 293      769
## oracleDziban:task4.Exploration                                363      885
## searchDFS:task2.RetrieveValue                                 363      903
## searchDFS:task3.Prediction                                    316      688
## searchDFS:task4.Exploration                                   437      873
## datasetmovies:task2.RetrieveValue                             404     1073
## datasetmovies:task3.Prediction                                373     1180
## datasetmovies:task4.Exploration                               453     1083
## oracleDziban:searchDFS:datasetmovies                          336      731
## oracleDziban:searchDFS:task2.RetrieveValue                    341      992
## oracleDziban:searchDFS:task3.Prediction                       268      799
## oracleDziban:searchDFS:task4.Exploration                      371      773
## oracleDziban:datasetmovies:task2.RetrieveValue                350      837
## oracleDziban:datasetmovies:task3.Prediction                   355     1146
## oracleDziban:datasetmovies:task4.Exploration                  409      927
## searchDFS:datasetmovies:task2.RetrieveValue                   383      791
## searchDFS:datasetmovies:task3.Prediction                      323      966
## searchDFS:datasetmovies:task4.Exploration                     421      877
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue      344     1039
## oracleDziban:searchDFS:datasetmovies:task3.Prediction         296      874
## oracleDziban:searchDFS:datasetmovies:task4.Exploration        382      842
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# plot(models$confidence_udata)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters mean that our model has difficulty differentiating the effects of those parameters).

# Pairs plot of the ordinal thresholds. The model summary for
# confidence.udata lists only Intercept[1] to Intercept[3], so a fourth
# threshold does not exist for this model and is not requested.
pairs(
  models$confidence_udata,
  pars = c("b_Intercept[1]",
           "b_Intercept[2]",
           "b_Intercept[3]"),
  fixed = TRUE
)

# Pairs plot of the main-effect coefficients. With fixed = TRUE the names
# must match the model's parameters exactly, including case: the factor
# levels were recoded to "Dziban" and "DFS", so the coefficients are
# b_oracleDziban and b_searchDFS (see the model summary).
pairs(
  models$confidence_udata,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at a response for confidence in understanding the data using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Build the posterior-draws figure with the project helper (defined outside
# this section), then display the `plot` element of what it returns.
confidence_udata_plot <- user_response_posterior_draws_plot(
  user_response_data,
  models[["confidence_udata"]],
  NULL,
  "Oracle/Search Combination",
  "Rating"
)
confidence_udata_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

# Print the `intervals` element returned by the plotting helper.
confidence_udata_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS    CompassQL  1.00   0.672  1.31    0.95 mean   qi       
## 2 BFS    Dziban     1.04   0.691  1.34    0.95 mean   qi       
## 3 DFS    CompassQL  0.785  0.426  1.10    0.95 mean   qi       
## 4 DFS    Dziban     1.17   0.859  1.47    0.95 mean   qi       
## 5 BFS    CompassQL  1.00   0.906  1.11    0.5  mean   qi       
## 6 BFS    Dziban     1.04   0.926  1.15    0.5  mean   qi       
## 7 DFS    CompassQL  0.785  0.676  0.897   0.5  mean   qi       
## 8 DFS    Dziban     1.17   1.06   1.28    0.5  mean   qi
## Saving 7 x 5 in image

Confidence in Understanding Data: Differences Between Conditions

Next, we want to see if there is any significant difference in confidence-in-understanding-data ratings between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).

# Posterior predictive draws for every observed row. re_formula = NA leaves
# the group-level (participant) effects out of the predictions.
confidence_udata_predictive_data <- add_predicted_draws(
  user_response_data,
  models[["confidence_udata"]],
  seed = seed,
  re_formula = NA
)
# Label each draw with its "<search> <oracle>" condition.
confidence_udata_predictive_data$alg <- paste(
  confidence_udata_predictive_data[["search"]],
  confidence_udata_predictive_data[["oracle"]]
)

Differences in user score by search algorithm.

# Contrast predicted confidence.udata ratings across the `search` column,
# then display the resulting plot.
search_differences[["confidence_udata"]] <- user_response_diff_plot(
  confidence_udata_predictive_data,
  "search",
  "confidence.udata",
  "Difference in Confidence in Understanding Data Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences[["confidence_udata"]]$plot

Differences in user score by oracle.

# Contrast predicted confidence.udata ratings across the `oracle` column,
# then display the resulting plot.
oracle_differences[["confidence_udata"]] <- user_response_diff_plot(
  confidence_udata_predictive_data,
  "oracle",
  "confidence.udata",
  "Difference in Confidence in Understanding Data Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences[["confidence_udata"]]$plot

Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)

# Keep only the two search/oracle combinations being contrasted.
confidence_udata_predictive_data_subset <- subset(
  confidence_udata_predictive_data,
  alg %in% c("DFS CompassQL", "BFS Dziban")
)

# Contrast predicted ratings between those two combinations and show the plot.
alg_differences[["confidence_udata"]] <- user_response_diff_plot(
  confidence_udata_predictive_data_subset,
  "alg",
  "confidence.udata",
  "Difference in Confidence in Understanding Data Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences[["confidence_udata"]]$plot

Differences in user score by participant group

# Contrast predicted confidence.udata ratings across participant groups,
# then display the resulting plot.
participant_group_differences[["confidence_udata"]] <- user_response_diff_plot(
  confidence_udata_predictive_data,
  "participant_group",
  "confidence.udata",
  "Difference in Confidence in Understanding Data Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences[["confidence_udata"]]$plot

Confidence in Answer: Building a Model

# Cumulative-probit ordinal regression for the confidence.ans rating:
# full factorial of oracle, search, dataset and task, an additive
# participant_group term, and a varying intercept per participant.
# The fitted model is stored on disk under the path given by `file`.
models[["confidence_ans"]] <- brm(
  formula = bf(
    confidence.ans ~ oracle * search * dataset * task + participant_group +
      (1 | participant_id)
  ),
  data = user_response_data,
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),
  seed = seed,
  file = "models/confidence_ans"
)
## Compiling Stan program...
## Start sampling

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

# Print Rhat, effective sample sizes and posterior summaries for the fit.
summary(models[["confidence_ans"]])
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: confidence.ans ~ oracle * search * dataset * task + participant_group + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 264) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 66) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.63      0.14     0.36     0.91 1.00      989     1536
## 
## Population-Level Effects: 
##                                                          Estimate Est.Error
## Intercept[1]                                                -3.55      0.59
## Intercept[2]                                                -2.70      0.54
## Intercept[3]                                                -1.82      0.53
## Intercept[4]                                                 0.08      0.51
## oracleDziban                                                 0.02      0.70
## searchDFS                                                   -0.21      0.68
## datasetmovies                                               -0.79      0.66
## task2.RetrieveValue                                          0.07      0.63
## task3.Prediction                                            -1.62      0.59
## task4.Exploration                                           -0.79      0.60
## participant_groupstudent                                     0.22      0.23
## oracleDziban:searchDFS                                      -0.30      0.96
## oracleDziban:datasetmovies                                   0.98      0.96
## searchDFS:datasetmovies                                      1.08      0.95
## oracleDziban:task2.RetrieveValue                            -0.10      0.88
## oracleDziban:task3.Prediction                                0.84      0.83
## oracleDziban:task4.Exploration                               0.51      0.84
## searchDFS:task2.RetrieveValue                                0.48      0.87
## searchDFS:task3.Prediction                                   0.43      0.79
## searchDFS:task4.Exploration                                 -0.25      0.83
## datasetmovies:task2.RetrieveValue                           -0.06      0.86
## datasetmovies:task3.Prediction                               0.81      0.78
## datasetmovies:task4.Exploration                              0.22      0.83
## oracleDziban:searchDFS:datasetmovies                        -0.78      1.35
## oracleDziban:searchDFS:task2.RetrieveValue                  -0.96      1.19
## oracleDziban:searchDFS:task3.Prediction                     -0.31      1.14
## oracleDziban:searchDFS:task4.Exploration                     0.25      1.17
## oracleDziban:datasetmovies:task2.RetrieveValue              -1.19      1.21
## oracleDziban:datasetmovies:task3.Prediction                 -1.89      1.14
## oracleDziban:datasetmovies:task4.Exploration                -1.22      1.16
## searchDFS:datasetmovies:task2.RetrieveValue                 -1.60      1.18
## searchDFS:datasetmovies:task3.Prediction                    -1.09      1.12
## searchDFS:datasetmovies:task4.Exploration                   -0.07      1.16
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue     3.48      1.69
## oracleDziban:searchDFS:datasetmovies:task3.Prediction        2.39      1.63
## oracleDziban:searchDFS:datasetmovies:task4.Exploration       1.20      1.63
##                                                          l-95% CI u-95% CI Rhat
## Intercept[1]                                                -4.75    -2.44 1.00
## Intercept[2]                                                -3.81    -1.64 1.00
## Intercept[3]                                                -2.89    -0.80 1.00
## Intercept[4]                                                -0.96     1.06 1.00
## oracleDziban                                                -1.35     1.40 1.00
## searchDFS                                                   -1.56     1.17 1.00
## datasetmovies                                               -2.10     0.50 1.00
## task2.RetrieveValue                                         -1.18     1.31 1.01
## task3.Prediction                                            -2.81    -0.48 1.00
## task4.Exploration                                           -1.96     0.37 1.00
## participant_groupstudent                                    -0.22     0.67 1.00
## oracleDziban:searchDFS                                      -2.21     1.64 1.00
## oracleDziban:datasetmovies                                  -0.95     2.86 1.00
## searchDFS:datasetmovies                                     -0.79     2.94 1.00
## oracleDziban:task2.RetrieveValue                            -1.83     1.62 1.01
## oracleDziban:task3.Prediction                               -0.69     2.53 1.00
## oracleDziban:task4.Exploration                              -1.15     2.17 1.00
## searchDFS:task2.RetrieveValue                               -1.23     2.24 1.01
## searchDFS:task3.Prediction                                  -1.14     2.00 1.00
## searchDFS:task4.Exploration                                 -1.91     1.42 1.00
## datasetmovies:task2.RetrieveValue                           -1.70     1.62 1.00
## datasetmovies:task3.Prediction                              -0.73     2.35 1.00
## datasetmovies:task4.Exploration                             -1.42     1.79 1.00
## oracleDziban:searchDFS:datasetmovies                        -3.48     1.86 1.00
## oracleDziban:searchDFS:task2.RetrieveValue                  -3.34     1.46 1.01
## oracleDziban:searchDFS:task3.Prediction                     -2.54     1.93 1.00
## oracleDziban:searchDFS:task4.Exploration                    -2.00     2.57 1.00
## oracleDziban:datasetmovies:task2.RetrieveValue              -3.55     1.12 1.00
## oracleDziban:datasetmovies:task3.Prediction                 -4.13     0.30 1.00
## oracleDziban:datasetmovies:task4.Exploration                -3.40     1.09 1.00
## searchDFS:datasetmovies:task2.RetrieveValue                 -3.93     0.66 1.00
## searchDFS:datasetmovies:task3.Prediction                    -3.24     1.20 1.00
## searchDFS:datasetmovies:task4.Exploration                   -2.37     2.24 1.00
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue     0.24     6.90 1.00
## oracleDziban:searchDFS:datasetmovies:task3.Prediction       -0.77     5.62 1.00
## oracleDziban:searchDFS:datasetmovies:task4.Exploration      -1.93     4.32 1.00
##                                                          Bulk_ESS Tail_ESS
## Intercept[1]                                                  700     1474
## Intercept[2]                                                  616     1073
## Intercept[3]                                                  591     1167
## Intercept[4]                                                  602     1254
## oracleDziban                                                  558     1001
## searchDFS                                                     543      881
## datasetmovies                                                 550     1151
## task2.RetrieveValue                                           652     1104
## task3.Prediction                                              666      954
## task4.Exploration                                             691     1092
## participant_groupstudent                                     1875     2000
## oracleDziban:searchDFS                                        549     1062
## oracleDziban:datasetmovies                                    555     1048
## searchDFS:datasetmovies                                       540      812
## oracleDziban:task2.RetrieveValue                              615     1268
## oracleDziban:task3.Prediction                                 695     1125
## oracleDziban:task4.Exploration                                702     1111
## searchDFS:task2.RetrieveValue                                 628      850
## searchDFS:task3.Prediction                                    703     1004
## searchDFS:task4.Exploration                                   674     1063
## datasetmovies:task2.RetrieveValue                             670     1037
## datasetmovies:task3.Prediction                                642      860
## datasetmovies:task4.Exploration                               685     1083
## oracleDziban:searchDFS:datasetmovies                          560     1190
## oracleDziban:searchDFS:task2.RetrieveValue                    631      985
## oracleDziban:searchDFS:task3.Prediction                       790     1183
## oracleDziban:searchDFS:task4.Exploration                      740     1034
## oracleDziban:datasetmovies:task2.RetrieveValue                664     1340
## oracleDziban:datasetmovies:task3.Prediction                   648     1283
## oracleDziban:datasetmovies:task4.Exploration                  715     1046
## searchDFS:datasetmovies:task2.RetrieveValue                   663     1033
## searchDFS:datasetmovies:task3.Prediction                      737     1137
## searchDFS:datasetmovies:task4.Exploration                     710     1218
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue      702     1325
## oracleDziban:searchDFS:datasetmovies:task3.Prediction         793     1129
## oracleDziban:searchDFS:datasetmovies:task4.Exploration        729     1256
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# plot(models$confidence_ans)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters mean that our model has difficulty differentiating the effects of those parameters).

# Pairs plot of the four ordinal thresholds of the confidence.ans model.
pairs(
  models[["confidence_ans"]],
  pars = paste0("b_Intercept[", 1:4, "]"),
  fixed = TRUE
)

# Pairs plot of the main-effect coefficients. With fixed = TRUE the names
# must match the model's parameters exactly, including case: the factor
# levels were recoded to "Dziban" and "DFS", so the coefficients are
# b_oracleDziban and b_searchDFS (see the model summary).
pairs(
  models$confidence_ans,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at a response for confidence in answer using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Build the posterior-draws figure with the project helper (defined outside
# this section), then display the `plot` element of what it returns.
confidence_ans_plot <- user_response_posterior_draws_plot(
  user_response_data,
  models[["confidence_ans"]],
  NULL,
  "Oracle/Search Combination",
  "Rating"
)
confidence_ans_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

# Print the `intervals` element returned by the plotting helper.
confidence_ans_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS    CompassQL  0.992  0.656   1.27   0.95 mean   qi       
## 2 BFS    Dziban     1.14   0.838   1.40   0.95 mean   qi       
## 3 DFS    CompassQL  1.07   0.765   1.34   0.95 mean   qi       
## 4 DFS    Dziban     1.22   0.937   1.48   0.95 mean   qi       
## 5 BFS    CompassQL  0.992  0.891   1.09   0.5  mean   qi       
## 6 BFS    Dziban     1.14   1.04    1.24   0.5  mean   qi       
## 7 DFS    CompassQL  1.07   0.971   1.18   0.5  mean   qi       
## 8 DFS    Dziban     1.22   1.12    1.31   0.5  mean   qi
## Saving 7 x 5 in image

Confidence in Answer: Differences Between Conditions

Next, we want to see if there is any significant difference in confidence-in-answer ratings between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).

# Posterior predictive draws for every observed row. `re_formula = NA` is passed
# through to the prediction so group-level (participant) effects are excluded.
confidence_ans_predictive_data <- add_predicted_draws(
  user_response_data,
  models[["confidence_ans"]],
  seed = seed,
  re_formula = NA
)
# Label each row with its "<search> <oracle>" combination.
confidence_ans_predictive_data[["alg"]] <- with(
  confidence_ans_predictive_data,
  paste(search, oracle)
)

Differences in user score by search algorithm.

# Compare predicted confidence-in-answer ratings between the search algorithms
# and show the resulting plot.
search_differences[["confidence_ans"]] <- user_response_diff_plot(
  confidence_ans_predictive_data,
  "search",
  "confidence.ans",
  "Difference in Confidence in Answer Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences[["confidence_ans"]][["plot"]]

Differences in user score by oracle.

# Compare predicted confidence-in-answer ratings between the oracles and show
# the resulting plot.
oracle_differences[["confidence_ans"]] <- user_response_diff_plot(
  confidence_ans_predictive_data,
  "oracle",
  "confidence.ans",
  "Difference in Confidence in Answer Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences[["confidence_ans"]][["plot"]]

Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)

# Keep only the two search/oracle combinations being contrasted, then compare
# their predicted confidence-in-answer ratings.
confidence_ans_predictive_data_subset <- subset(
  confidence_ans_predictive_data,
  alg %in% c("DFS CompassQL", "BFS Dziban")
)
alg_differences[["confidence_ans"]] <- user_response_diff_plot(
  confidence_ans_predictive_data_subset,
  "alg",
  "confidence.ans",
  "Difference in Confidence in Answer Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences[["confidence_ans"]][["plot"]]

Differences in user score by participant group

# Compare predicted confidence-in-answer ratings between participant groups and
# show the resulting plot.
participant_group_differences[["confidence_ans"]] <- user_response_diff_plot(
  confidence_ans_predictive_data,
  "participant_group",
  "confidence.ans",
  "Difference in Confidence in Answer Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences[["confidence_ans"]][["plot"]]

Efficiency: Building a Model

# NOTE(review): `filename` is not referenced by the call below in this view;
# it is kept in case later code reads it.
filename <- "efficiency"

# Cumulative-probit ordinal model of the efficiency rating: full interaction of
# oracle, search, dataset and task, an additive participant-group term, and a
# varying intercept per participant. `file` lets brms store/reuse the fit.
models[["efficiency"]] <- brm(
  formula = bf(
    efficiency ~ oracle * search * dataset * task + participant_group +
      (1 | participant_id)
  ),
  data = user_response_data,
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),
  seed = seed,
  file = "models/efficiency"
)
## Compiling Stan program...
## Start sampling
## Warning: There were 4 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## http://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

# Print estimates and convergence diagnostics (Rhat, Bulk_ESS, Tail_ESS).
summary(models[["efficiency"]])
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: efficiency ~ oracle * search * dataset * task + participant_group + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 264) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 66) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.23      0.18     0.91     1.62 1.00      762     1467
## 
## Population-Level Effects: 
##                                                          Estimate Est.Error
## Intercept[1]                                                -3.28      0.67
## Intercept[2]                                                -1.57      0.63
## Intercept[3]                                                -0.65      0.63
## Intercept[4]                                                 0.63      0.63
## oracleDziban                                                -0.44      0.80
## searchDFS                                                   -2.49      0.83
## datasetmovies                                               -0.33      0.84
## task2.RetrieveValue                                         -0.81      0.55
## task3.Prediction                                            -0.25      0.57
## task4.Exploration                                            0.03      0.56
## participant_groupstudent                                     0.31      0.35
## oracleDziban:searchDFS                                       1.75      1.16
## oracleDziban:datasetmovies                                  -0.78      1.13
## searchDFS:datasetmovies                                      1.42      1.15
## oracleDziban:task2.RetrieveValue                             0.67      0.77
## oracleDziban:task3.Prediction                                0.73      0.78
## oracleDziban:task4.Exploration                               0.51      0.77
## searchDFS:task2.RetrieveValue                                0.50      0.81
## searchDFS:task3.Prediction                                   0.37      0.83
## searchDFS:task4.Exploration                                  0.75      0.81
## datasetmovies:task2.RetrieveValue                            0.10      0.78
## datasetmovies:task3.Prediction                              -0.15      0.79
## datasetmovies:task4.Exploration                             -0.19      0.80
## oracleDziban:searchDFS:datasetmovies                         0.02      1.60
## oracleDziban:searchDFS:task2.RetrieveValue                  -1.20      1.12
## oracleDziban:searchDFS:task3.Prediction                     -0.93      1.11
## oracleDziban:searchDFS:task4.Exploration                    -1.52      1.10
## oracleDziban:datasetmovies:task2.RetrieveValue               0.23      1.09
## oracleDziban:datasetmovies:task3.Prediction                  1.09      1.09
## oracleDziban:datasetmovies:task4.Exploration                 0.79      1.10
## searchDFS:datasetmovies:task2.RetrieveValue                  0.36      1.10
## searchDFS:datasetmovies:task3.Prediction                     0.55      1.13
## searchDFS:datasetmovies:task4.Exploration                   -0.10      1.12
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue     0.13      1.53
## oracleDziban:searchDFS:datasetmovies:task3.Prediction       -1.13      1.56
## oracleDziban:searchDFS:datasetmovies:task4.Exploration       1.01      1.55
##                                                          l-95% CI u-95% CI Rhat
## Intercept[1]                                                -4.63    -2.01 1.00
## Intercept[2]                                                -2.81    -0.35 1.00
## Intercept[3]                                                -1.86     0.55 1.00
## Intercept[4]                                                -0.62     1.87 1.00
## oracleDziban                                                -2.10     1.17 1.01
## searchDFS                                                   -4.16    -0.94 1.00
## datasetmovies                                               -1.96     1.33 1.00
## task2.RetrieveValue                                         -1.93     0.23 1.01
## task3.Prediction                                            -1.39     0.87 1.00
## task4.Exploration                                           -1.10     1.08 1.00
## participant_groupstudent                                    -0.36     1.00 1.00
## oracleDziban:searchDFS                                      -0.48     4.02 1.01
## oracleDziban:datasetmovies                                  -2.99     1.42 1.00
## searchDFS:datasetmovies                                     -0.81     3.62 1.00
## oracleDziban:task2.RetrieveValue                            -0.84     2.18 1.01
## oracleDziban:task3.Prediction                               -0.76     2.29 1.01
## oracleDziban:task4.Exploration                              -0.98     1.99 1.01
## searchDFS:task2.RetrieveValue                               -1.02     2.14 1.01
## searchDFS:task3.Prediction                                  -1.24     2.05 1.00
## searchDFS:task4.Exploration                                 -0.82     2.35 1.00
## datasetmovies:task2.RetrieveValue                           -1.40     1.67 1.00
## datasetmovies:task3.Prediction                              -1.68     1.41 1.00
## datasetmovies:task4.Exploration                             -1.78     1.39 1.00
## oracleDziban:searchDFS:datasetmovies                        -3.08     3.16 1.00
## oracleDziban:searchDFS:task2.RetrieveValue                  -3.45     0.91 1.01
## oracleDziban:searchDFS:task3.Prediction                     -3.06     1.17 1.01
## oracleDziban:searchDFS:task4.Exploration                    -3.70     0.68 1.01
## oracleDziban:datasetmovies:task2.RetrieveValue              -1.85     2.37 1.01
## oracleDziban:datasetmovies:task3.Prediction                 -1.02     3.17 1.00
## oracleDziban:datasetmovies:task4.Exploration                -1.35     2.93 1.01
## searchDFS:datasetmovies:task2.RetrieveValue                 -1.82     2.49 1.00
## searchDFS:datasetmovies:task3.Prediction                    -1.68     2.71 1.00
## searchDFS:datasetmovies:task4.Exploration                   -2.27     2.01 1.00
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue    -2.82     3.12 1.01
## oracleDziban:searchDFS:datasetmovies:task3.Prediction       -4.22     1.79 1.00
## oracleDziban:searchDFS:datasetmovies:task4.Exploration      -2.08     4.06 1.01
##                                                          Bulk_ESS Tail_ESS
## Intercept[1]                                                  681     1130
## Intercept[2]                                                  645      973
## Intercept[3]                                                  640      992
## Intercept[4]                                                  627      953
## oracleDziban                                                  565      959
## searchDFS                                                     553      774
## datasetmovies                                                 508      824
## task2.RetrieveValue                                           720     1364
## task3.Prediction                                              682     1011
## task4.Exploration                                             760     1407
## participant_groupstudent                                      827     1334
## oracleDziban:searchDFS                                        480     1047
## oracleDziban:datasetmovies                                    498      706
## searchDFS:datasetmovies                                       450      904
## oracleDziban:task2.RetrieveValue                              698     1200
## oracleDziban:task3.Prediction                                 724     1197
## oracleDziban:task4.Exploration                                771     1432
## searchDFS:task2.RetrieveValue                                 634      991
## searchDFS:task3.Prediction                                    654     1225
## searchDFS:task4.Exploration                                   687      961
## datasetmovies:task2.RetrieveValue                             622     1295
## datasetmovies:task3.Prediction                                657      977
## datasetmovies:task4.Exploration                               717     1237
## oracleDziban:searchDFS:datasetmovies                          427      993
## oracleDziban:searchDFS:task2.RetrieveValue                    649     1065
## oracleDziban:searchDFS:task3.Prediction                       670     1185
## oracleDziban:searchDFS:task4.Exploration                      697     1181
## oracleDziban:datasetmovies:task2.RetrieveValue                667     1309
## oracleDziban:datasetmovies:task3.Prediction                   678     1200
## oracleDziban:datasetmovies:task4.Exploration                  718     1500
## searchDFS:datasetmovies:task2.RetrieveValue                   586      987
## searchDFS:datasetmovies:task3.Prediction                      647     1092
## searchDFS:datasetmovies:task4.Exploration                     655     1137
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue      615     1296
## oracleDziban:searchDFS:datasetmovies:task3.Prediction         657     1421
## oracleDziban:searchDFS:datasetmovies:task4.Exploration        687     1199
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# plot(models$efficiency)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters mean that our model has difficulty differentiating the effects of those parameters).

# Pairwise posterior plots for the four threshold (intercept) parameters of the
# efficiency model. The names are generated rather than listed by hand.
pairs(
  models[["efficiency"]],
  pars = paste0("b_Intercept[", 1:4, "]"),
  fixed = TRUE
)

# Pairwise posterior plots for the main-effect coefficients of the efficiency
# model.
# With `fixed = TRUE` the names are matched literally (case-sensitive). The
# model summary reports the coefficients as `oracleDziban` and `searchDFS`, so
# the parameter names are `b_oracleDziban` and `b_searchDFS`; the previous
# lower-case spellings did not match any parameter.
pairs(
  models$efficiency,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at a response for efficiency using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Build the posterior-draws summary for the efficiency model with the project
# helper; the result is indexed for its `plot` element here and for its
# `intervals` element further down.
efficiency_plot <- user_response_posterior_draws_plot(
  user_response_data,
  models[["efficiency"]],
  NULL,
  "Oracle/Search Combination",
  "Rating"
)
efficiency_plot[["plot"]]

We can get the numeric values of the interval boundaries shown above with mean_qi

# Print the interval table stored alongside the plot.
efficiency_plot[["intervals"]]
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating  .lower  .upper .width .point .interval
##   <fct>  <fct>      <dbl>   <dbl>   <dbl>  <dbl> <chr>  <chr>    
## 1 BFS    CompassQL  0.705  0.125   1.23     0.95 mean   qi       
## 2 BFS    Dziban     0.608  0.0732  1.09     0.95 mean   qi       
## 3 DFS    CompassQL -0.249 -0.779   0.294    0.95 mean   qi       
## 4 DFS    Dziban     0.293 -0.234   0.828    0.95 mean   qi       
## 5 BFS    CompassQL  0.705  0.516   0.906    0.5  mean   qi       
## 6 BFS    Dziban     0.608  0.426   0.809    0.5  mean   qi       
## 7 DFS    CompassQL -0.249 -0.426  -0.0588   0.5  mean   qi       
## 8 DFS    Dziban     0.293  0.109   0.484    0.5  mean   qi
## Saving 7 x 5 in image

Efficiency: Differences Between Conditions

Next, we want to see if there is any significant difference in efficiency ratings between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).

# Posterior predictive draws for every observed row. `re_formula = NA` is passed
# through to the prediction so group-level (participant) effects are excluded.
efficiency_predictive_data <- add_predicted_draws(
  user_response_data,
  models[["efficiency"]],
  seed = seed,
  re_formula = NA
)
# Label each row with its "<search> <oracle>" combination.
efficiency_predictive_data[["alg"]] <- with(
  efficiency_predictive_data,
  paste(search, oracle)
)

Differences in user score by search algorithm.

# Compare predicted efficiency ratings between the search algorithms and show
# the resulting plot.
search_differences[["efficiency"]] <- user_response_diff_plot(
  efficiency_predictive_data,
  "search",
  "efficiency",
  "Difference in Efficiency Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences[["efficiency"]][["plot"]]

Differences in user score by oracle.

# Compare predicted efficiency ratings between the oracles and show the
# resulting plot.
oracle_differences[["efficiency"]] <- user_response_diff_plot(
  efficiency_predictive_data,
  "oracle",
  "efficiency",
  "Difference in Efficiency Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences[["efficiency"]][["plot"]]

Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)

# Keep only the two search/oracle combinations being contrasted, then compare
# their predicted efficiency ratings.
efficiency_predictive_data_data_subset <- subset(
  efficiency_predictive_data,
  alg %in% c("DFS CompassQL", "BFS Dziban")
)
alg_differences[["efficiency"]] <- user_response_diff_plot(
  efficiency_predictive_data_data_subset,
  "alg",
  "efficiency",
  "Difference in Efficiency Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences[["efficiency"]][["plot"]]

Differences in user score by participant group

# Compare predicted efficiency ratings between participant groups and show the
# resulting plot.
participant_group_differences[["efficiency"]] <- user_response_diff_plot(
  efficiency_predictive_data,
  "participant_group",
  "efficiency",
  "Difference in Efficiency Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences[["efficiency"]][["plot"]]

Ease of Use: Building a Model

# Cumulative-probit ordinal model of the ease-of-use rating: full interaction of
# oracle, search, dataset and task, an additive participant-group term, and a
# varying intercept per participant. `file` lets brms store/reuse the fit.
models[["ease_of_use"]] <- brm(
  formula = bf(
    ease.of.use ~ oracle * search * dataset * task + participant_group +
      (1 | participant_id)
  ),
  data = user_response_data,
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),
  seed = seed,
  file = "models/ease_of_use"
)
## Compiling Stan program...
## Start sampling

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

# Print estimates and convergence diagnostics (Rhat, Bulk_ESS, Tail_ESS).
summary(models[["ease_of_use"]])
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: ease.of.use ~ oracle * search * dataset * task + participant_group + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 264) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 66) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.13      0.16     0.83     1.47 1.01      831     1624
## 
## Population-Level Effects: 
##                                                          Estimate Est.Error
## Intercept[1]                                                -2.75      0.66
## Intercept[2]                                                -1.23      0.61
## Intercept[3]                                                -0.22      0.61
## Intercept[4]                                                 1.82      0.63
## oracleDziban                                                -0.28      0.79
## searchDFS                                                   -1.66      0.80
## datasetmovies                                               -0.51      0.80
## task2.RetrieveValue                                          0.76      0.59
## task3.Prediction                                             0.58      0.60
## task4.Exploration                                            1.35      0.63
## participant_groupstudent                                     0.55      0.33
## oracleDziban:searchDFS                                       1.01      1.09
## oracleDziban:datasetmovies                                   1.25      1.11
## searchDFS:datasetmovies                                      1.67      1.09
## oracleDziban:task2.RetrieveValue                            -0.92      0.79
## oracleDziban:task3.Prediction                                0.19      0.81
## oracleDziban:task4.Exploration                              -0.87      0.83
## searchDFS:task2.RetrieveValue                               -0.03      0.81
## searchDFS:task3.Prediction                                  -0.75      0.81
## searchDFS:task4.Exploration                                 -1.32      0.82
## datasetmovies:task2.RetrieveValue                            0.24      0.82
## datasetmovies:task3.Prediction                               0.69      0.84
## datasetmovies:task4.Exploration                             -0.47      0.84
## oracleDziban:searchDFS:datasetmovies                        -1.90      1.56
## oracleDziban:searchDFS:task2.RetrieveValue                  -0.13      1.11
## oracleDziban:searchDFS:task3.Prediction                     -0.08      1.14
## oracleDziban:searchDFS:task4.Exploration                     0.78      1.14
## oracleDziban:datasetmovies:task2.RetrieveValue              -0.46      1.10
## oracleDziban:datasetmovies:task3.Prediction                 -2.03      1.15
## oracleDziban:datasetmovies:task4.Exploration                -0.52      1.14
## searchDFS:datasetmovies:task2.RetrieveValue                 -1.17      1.12
## searchDFS:datasetmovies:task3.Prediction                    -0.00      1.12
## searchDFS:datasetmovies:task4.Exploration                    0.71      1.10
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue     2.10      1.57
## oracleDziban:searchDFS:datasetmovies:task3.Prediction        2.12      1.59
## oracleDziban:searchDFS:datasetmovies:task4.Exploration       1.87      1.60
##                                                          l-95% CI u-95% CI Rhat
## Intercept[1]                                                -4.07    -1.48 1.00
## Intercept[2]                                                -2.46    -0.04 1.00
## Intercept[3]                                                -1.45     0.95 1.00
## Intercept[4]                                                 0.61     3.04 1.00
## oracleDziban                                                -1.85     1.22 1.00
## searchDFS                                                   -3.17    -0.03 1.00
## datasetmovies                                               -2.05     1.07 1.00
## task2.RetrieveValue                                         -0.42     1.91 1.00
## task3.Prediction                                            -0.56     1.78 1.00
## task4.Exploration                                            0.14     2.63 1.01
## participant_groupstudent                                    -0.07     1.21 1.00
## oracleDziban:searchDFS                                      -1.13     3.10 1.00
## oracleDziban:datasetmovies                                  -0.90     3.47 1.00
## searchDFS:datasetmovies                                     -0.51     3.89 1.00
## oracleDziban:task2.RetrieveValue                            -2.46     0.61 1.00
## oracleDziban:task3.Prediction                               -1.38     1.81 1.00
## oracleDziban:task4.Exploration                              -2.50     0.72 1.00
## searchDFS:task2.RetrieveValue                               -1.60     1.53 1.00
## searchDFS:task3.Prediction                                  -2.38     0.85 1.00
## searchDFS:task4.Exploration                                 -2.93     0.26 1.00
## datasetmovies:task2.RetrieveValue                           -1.38     1.84 1.00
## datasetmovies:task3.Prediction                              -1.03     2.28 1.00
## datasetmovies:task4.Exploration                             -2.15     1.14 1.00
## oracleDziban:searchDFS:datasetmovies                        -5.00     1.12 1.00
## oracleDziban:searchDFS:task2.RetrieveValue                  -2.30     2.01 1.00
## oracleDziban:searchDFS:task3.Prediction                     -2.37     2.15 1.00
## oracleDziban:searchDFS:task4.Exploration                    -1.46     2.98 1.00
## oracleDziban:datasetmovies:task2.RetrieveValue              -2.58     1.80 1.00
## oracleDziban:datasetmovies:task3.Prediction                 -4.27     0.27 1.00
## oracleDziban:datasetmovies:task4.Exploration                -2.67     1.69 1.00
## searchDFS:datasetmovies:task2.RetrieveValue                 -3.40     1.02 1.00
## searchDFS:datasetmovies:task3.Prediction                    -2.21     2.19 1.01
## searchDFS:datasetmovies:task4.Exploration                   -1.43     2.84 1.00
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue    -0.88     5.19 1.00
## oracleDziban:searchDFS:datasetmovies:task3.Prediction       -0.98     5.34 1.01
## oracleDziban:searchDFS:datasetmovies:task4.Exploration      -1.25     4.96 1.00
##                                                          Bulk_ESS Tail_ESS
## Intercept[1]                                                  615      963
## Intercept[2]                                                  602     1115
## Intercept[3]                                                  607     1032
## Intercept[4]                                                  624     1183
## oracleDziban                                                  538     1141
## searchDFS                                                     538      866
## datasetmovies                                                 544      880
## task2.RetrieveValue                                           537     1389
## task3.Prediction                                              550     1106
## task4.Exploration                                             673     1423
## participant_groupstudent                                     1078     1408
## oracleDziban:searchDFS                                        590     1045
## oracleDziban:datasetmovies                                    590     1137
## searchDFS:datasetmovies                                       532      918
## oracleDziban:task2.RetrieveValue                              586     1436
## oracleDziban:task3.Prediction                                 516     1555
## oracleDziban:task4.Exploration                                691     1497
## searchDFS:task2.RetrieveValue                                 641     1263
## searchDFS:task3.Prediction                                    609      887
## searchDFS:task4.Exploration                                   707     1759
## datasetmovies:task2.RetrieveValue                             545     1273
## datasetmovies:task3.Prediction                                528     1240
## datasetmovies:task4.Exploration                               623     1381
## oracleDziban:searchDFS:datasetmovies                          564     1273
## oracleDziban:searchDFS:task2.RetrieveValue                    725     1351
## oracleDziban:searchDFS:task3.Prediction                       643     1391
## oracleDziban:searchDFS:task4.Exploration                      830     1694
## oracleDziban:datasetmovies:task2.RetrieveValue                589     1407
## oracleDziban:datasetmovies:task3.Prediction                   504     1014
## oracleDziban:datasetmovies:task4.Exploration                  717     1627
## searchDFS:datasetmovies:task2.RetrieveValue                   652     1469
## searchDFS:datasetmovies:task3.Prediction                      545      981
## searchDFS:datasetmovies:task4.Exploration                     619     1364
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue      805     1534
## oracleDziban:searchDFS:datasetmovies:task3.Prediction         591     1623
## oracleDziban:searchDFS:datasetmovies:task4.Exploration        850     1774
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# plot(models$ease_of_use)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters mean that our model has difficulty differentiating the effects of those parameters).

# Pairwise posterior plots for the four threshold (intercept) parameters of the
# ease-of-use model. The names are generated rather than listed by hand.
pairs(
  models[["ease_of_use"]],
  pars = paste0("b_Intercept[", 1:4, "]"),
  fixed = TRUE
)

# Pairwise posterior plots for the main-effect coefficients of the ease-of-use
# model.
# With `fixed = TRUE` the names are matched literally (case-sensitive). The
# model summary reports the coefficients as `oracleDziban` and `searchDFS`, so
# the parameter names are `b_oracleDziban` and `b_searchDFS`; the previous
# lower-case spellings did not match any parameter.
pairs(
  models$ease_of_use,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at a response for ease of use using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Build the posterior-draws summary for the ease-of-use model with the project
# helper; the result is indexed for its `plot` element here and for its
# `intervals` element further down.
ease_of_use_plot <- user_response_posterior_draws_plot(
  user_response_data,
  models[["ease_of_use"]],
  NULL,
  "Oracle/Search Combination",
  "Rating"
)
ease_of_use_plot[["plot"]]

We can get the numeric values of the interval boundaries shown above with mean_qi

# Print the interval table stored alongside the plot.
ease_of_use_plot[["intervals"]]
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating  .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS    CompassQL 0.943   0.547   1.30    0.95 mean   qi       
## 2 BFS    Dziban    0.704   0.265   1.09    0.95 mean   qi       
## 3 DFS    CompassQL 0.0674 -0.412   0.5     0.95 mean   qi       
## 4 DFS    Dziban    0.407  -0.0312  0.812   0.95 mean   qi       
## 5 BFS    CompassQL 0.943   0.812   1.08    0.5  mean   qi       
## 6 BFS    Dziban    0.704   0.574   0.838   0.5  mean   qi       
## 7 DFS    CompassQL 0.0674 -0.0882  0.235   0.5  mean   qi       
## 8 DFS    Dziban    0.407   0.25    0.562   0.5  mean   qi
## Saving 7 x 5 in image

Ease of Use: Differences Between Conditions

Next, we want to see if there is any significant difference in ease-of-use ratings between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).

# Posterior predictive draws for every observed row. `re_formula = NA` is passed
# through to the prediction so group-level (participant) effects are excluded.
ease_of_use_predictive_data <- add_predicted_draws(
  user_response_data,
  models[["ease_of_use"]],
  seed = seed,
  re_formula = NA
)
# Label each row with its "<search> <oracle>" combination.
ease_of_use_predictive_data[["alg"]] <- with(
  ease_of_use_predictive_data,
  paste(search, oracle)
)

Differences in user score by search algorithm.

# Compare predicted ease-of-use ratings between the search algorithms and show
# the resulting plot.
search_differences[["ease_of_use"]] <- user_response_diff_plot(
  ease_of_use_predictive_data,
  "search",
  "ease.of.use",
  "Difference in Ease of Use Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences[["ease_of_use"]][["plot"]]

Differences in user score by oracle.

# Compare predicted ease-of-use ratings between the oracles and show the
# resulting plot.
oracle_differences[["ease_of_use"]] <- user_response_diff_plot(
  ease_of_use_predictive_data,
  "oracle",
  "ease.of.use",
  "Difference in Ease of Use Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences[["ease_of_use"]][["plot"]]

Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)

# Keep only the two search/oracle combinations being contrasted, then compare
# their predicted ease-of-use ratings.
ease_of_use_predictive_data_subset <- subset(
  ease_of_use_predictive_data,
  alg %in% c("DFS CompassQL", "BFS Dziban")
)

alg_differences[["ease_of_use"]] <- user_response_diff_plot(
  ease_of_use_predictive_data_subset,
  "alg",
  "ease.of.use",
  "Difference in Ease of Use Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences[["ease_of_use"]][["plot"]]

Differences in user score by participant group

# Compare predicted ease-of-use ratings between participant groups and show the
# resulting plot.
participant_group_differences[["ease_of_use"]] <- user_response_diff_plot(
  ease_of_use_predictive_data,
  "participant_group",
  "ease.of.use",
  "Difference in Ease of Use Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences[["ease_of_use"]][["plot"]]

Utility: Building a Model

# Cumulative-probit ordinal model of the utility rating: full interaction of
# oracle, search, dataset and task, an additive participant-group term, and a
# varying intercept per participant. `file` lets brms store/reuse the fit.
models[["utility"]] <- brm(
  formula = bf(
    utility ~ oracle * search * dataset * task + participant_group +
      (1 | participant_id)
  ),
  data = user_response_data,
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),
  seed = seed,
  file = "models/utility"
)
## Compiling Stan program...
## Start sampling
## Warning: There were 1 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## http://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

# Print the fitted utility model's estimates together with Rhat and ESS.
summary(models[["utility"]])
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: utility ~ oracle * search * dataset * task + participant_group + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 264) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 66) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.06      0.16     0.77     1.41 1.00      787     1539
## 
## Population-Level Effects: 
##                                                          Estimate Est.Error
## Intercept[1]                                                -2.26      0.63
## Intercept[2]                                                -0.89      0.60
## Intercept[3]                                                -0.20      0.60
## Intercept[4]                                                 1.18      0.60
## oracleDziban                                                 0.31      0.77
## searchDFS                                                   -1.85      0.81
## datasetmovies                                               -0.13      0.79
## task2.RetrieveValue                                         -0.44      0.54
## task3.Prediction                                            -0.06      0.55
## task4.Exploration                                            0.41      0.56
## participant_groupstudent                                     0.31      0.30
## oracleDziban:searchDFS                                       0.65      1.12
## oracleDziban:datasetmovies                                  -1.00      1.07
## searchDFS:datasetmovies                                      1.23      1.09
## oracleDziban:task2.RetrieveValue                            -0.16      0.76
## oracleDziban:task3.Prediction                                0.06      0.77
## oracleDziban:task4.Exploration                              -0.48      0.78
## searchDFS:task2.RetrieveValue                                0.33      0.81
## searchDFS:task3.Prediction                                   0.25      0.79
## searchDFS:task4.Exploration                                  0.68      0.81
## datasetmovies:task2.RetrieveValue                           -0.68      0.78
## datasetmovies:task3.Prediction                               0.36      0.78
## datasetmovies:task4.Exploration                              0.19      0.79
## oracleDziban:searchDFS:datasetmovies                         0.24      1.54
## oracleDziban:searchDFS:task2.RetrieveValue                  -0.01      1.12
## oracleDziban:searchDFS:task3.Prediction                     -0.25      1.12
## oracleDziban:searchDFS:task4.Exploration                    -0.34      1.14
## oracleDziban:datasetmovies:task2.RetrieveValue               1.44      1.09
## oracleDziban:datasetmovies:task3.Prediction                  0.61      1.10
## oracleDziban:datasetmovies:task4.Exploration                 1.37      1.10
## searchDFS:datasetmovies:task2.RetrieveValue                  0.98      1.09
## searchDFS:datasetmovies:task3.Prediction                     0.10      1.07
## searchDFS:datasetmovies:task4.Exploration                   -0.82      1.10
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue    -0.71      1.57
## oracleDziban:searchDFS:datasetmovies:task3.Prediction       -0.29      1.55
## oracleDziban:searchDFS:datasetmovies:task4.Exploration       0.23      1.57
##                                                          l-95% CI u-95% CI Rhat
## Intercept[1]                                                -3.48    -1.09 1.00
## Intercept[2]                                                -2.08     0.23 1.00
## Intercept[3]                                                -1.38     0.92 1.00
## Intercept[4]                                                -0.01     2.31 1.00
## oracleDziban                                                -1.28     1.75 1.01
## searchDFS                                                   -3.51    -0.37 1.00
## datasetmovies                                               -1.68     1.42 1.00
## task2.RetrieveValue                                         -1.46     0.60 1.01
## task3.Prediction                                            -1.13     1.02 1.01
## task4.Exploration                                           -0.68     1.49 1.01
## participant_groupstudent                                    -0.29     0.91 1.00
## oracleDziban:searchDFS                                      -1.51     2.89 1.00
## oracleDziban:datasetmovies                                  -3.06     1.17 1.01
## searchDFS:datasetmovies                                     -0.89     3.36 1.00
## oracleDziban:task2.RetrieveValue                            -1.67     1.30 1.00
## oracleDziban:task3.Prediction                               -1.43     1.54 1.00
## oracleDziban:task4.Exploration                              -1.96     1.04 1.01
## searchDFS:task2.RetrieveValue                               -1.25     1.89 1.01
## searchDFS:task3.Prediction                                  -1.31     1.77 1.01
## searchDFS:task4.Exploration                                 -0.90     2.28 1.01
## datasetmovies:task2.RetrieveValue                           -2.23     0.85 1.00
## datasetmovies:task3.Prediction                              -1.18     1.85 1.01
## datasetmovies:task4.Exploration                             -1.35     1.74 1.00
## oracleDziban:searchDFS:datasetmovies                        -2.76     3.25 1.00
## oracleDziban:searchDFS:task2.RetrieveValue                  -2.20     2.17 1.01
## oracleDziban:searchDFS:task3.Prediction                     -2.35     1.95 1.01
## oracleDziban:searchDFS:task4.Exploration                    -2.53     1.87 1.01
## oracleDziban:datasetmovies:task2.RetrieveValue              -0.66     3.62 1.00
## oracleDziban:datasetmovies:task3.Prediction                 -1.51     2.80 1.00
## oracleDziban:datasetmovies:task4.Exploration                -0.74     3.51 1.00
## searchDFS:datasetmovies:task2.RetrieveValue                 -1.07     3.17 1.00
## searchDFS:datasetmovies:task3.Prediction                    -1.94     2.18 1.01
## searchDFS:datasetmovies:task4.Exploration                   -3.00     1.42 1.01
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue    -3.76     2.22 1.00
## oracleDziban:searchDFS:datasetmovies:task3.Prediction       -3.39     2.65 1.00
## oracleDziban:searchDFS:datasetmovies:task4.Exploration      -2.86     3.35 1.01
##                                                          Bulk_ESS Tail_ESS
## Intercept[1]                                                  452      929
## Intercept[2]                                                  459      869
## Intercept[3]                                                  474      969
## Intercept[4]                                                  481     1043
## oracleDziban                                                  436      854
## searchDFS                                                     424      878
## datasetmovies                                                 452      862
## task2.RetrieveValue                                           481     1197
## task3.Prediction                                              443     1201
## task4.Exploration                                             425     1087
## participant_groupstudent                                      751     1167
## oracleDziban:searchDFS                                        396      724
## oracleDziban:datasetmovies                                    424      832
## searchDFS:datasetmovies                                       431      824
## oracleDziban:task2.RetrieveValue                              471     1078
## oracleDziban:task3.Prediction                                 447     1202
## oracleDziban:task4.Exploration                                408     1114
## searchDFS:task2.RetrieveValue                                 445      992
## searchDFS:task3.Prediction                                    379      758
## searchDFS:task4.Exploration                                   386     1010
## datasetmovies:task2.RetrieveValue                             565     1065
## datasetmovies:task3.Prediction                                508     1428
## datasetmovies:task4.Exploration                               476     1054
## oracleDziban:searchDFS:datasetmovies                          400      726
## oracleDziban:searchDFS:task2.RetrieveValue                    422      807
## oracleDziban:searchDFS:task3.Prediction                       386     1009
## oracleDziban:searchDFS:task4.Exploration                      337      853
## oracleDziban:datasetmovies:task2.RetrieveValue                554     1210
## oracleDziban:datasetmovies:task3.Prediction                   511     1247
## oracleDziban:datasetmovies:task4.Exploration                  461     1241
## searchDFS:datasetmovies:task2.RetrieveValue                   483      787
## searchDFS:datasetmovies:task3.Prediction                      429     1001
## searchDFS:datasetmovies:task4.Exploration                     429      838
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue      480      781
## oracleDziban:searchDFS:datasetmovies:task3.Prediction         429      868
## oracleDziban:searchDFS:datasetmovies:task4.Exploration        397      997
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# plot(models$utility)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairs plot of the four ordinal threshold parameters
# (b_Intercept[1] ... b_Intercept[4]), matched by exact name.
pairs(
  models[["utility"]],
  pars = paste0("b_Intercept[", 1:4, "]"),
  fixed = TRUE
)

# Pairs plot of the main-effect coefficients. With `fixed = TRUE` the names
# are matched exactly, so they must use the same casing as the factor levels
# reported by summary() (oracleDziban, searchDFS); the previous lower-case
# spellings matched no parameter.
pairs(
  models$utility,
  pars = c(
    "b_datasetmovies",
    "b_oracleDziban",
    "b_searchDFS",
    "b_task2.RetrieveValue",
    "b_task3.Prediction",
    "b_task4.Exploration"
  ),
  fixed = TRUE
)

We now look at a response for Utility using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Build the per oracle/search rating summary from the fitted utility model
# with the shared helper, then display the plot it returns.
utility_plot <- user_response_posterior_draws_plot(
  user_response_data,
  models[["utility"]],
  NULL,
  "Oracle/Search Combination",
  "Rating"
)
utility_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

# Interval table for the utility plot: mean rating with 95% and 50% quantile
# intervals for each search/oracle combination.
utility_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating  .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS    CompassQL  0.535 -0.0156  1.05    0.95 mean   qi       
## 2 BFS    Dziban     0.596  0.0441  1.09    0.95 mean   qi       
## 3 DFS    CompassQL -0.187 -0.706   0.353   0.95 mean   qi       
## 4 DFS    Dziban     0.301 -0.25    0.812   0.95 mean   qi       
## 5 BFS    CompassQL  0.535  0.359   0.719   0.5  mean   qi       
## 6 BFS    Dziban     0.596  0.426   0.765   0.5  mean   qi       
## 7 DFS    CompassQL -0.187 -0.382   0       0.5  mean   qi       
## 8 DFS    Dziban     0.301  0.125   0.484   0.5  mean   qi
## Saving 7 x 5 in image

Utility: Differences Between Conditions

Next, we want to see if there is any significant difference in Utility rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).

# Posterior predictive draws of the utility rating for the observed rows,
# with re_formula = NA (no group-level effects), plus an `alg` label that
# pastes the search and oracle names together.
utility_predictive_data <- add_predicted_draws(
  user_response_data,
  models[["utility"]],
  seed = seed,
  re_formula = NA
)
utility_predictive_data$alg <- with(
  utility_predictive_data,
  paste(search, oracle)
)

Differences in user score by search algorithm.

# Utility rating differences between search algorithms, via the shared
# user_response_diff_plot() helper; the returned plot is displayed below.
search_differences[["utility"]] <- user_response_diff_plot(
  utility_predictive_data,
  "search",
  "utility",
  "Difference in Utility Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences[["utility"]]$plot

Differences in user score by oracle.

# Utility rating differences between oracles, via the shared
# user_response_diff_plot() helper; the returned plot is displayed below.
oracle_differences[["utility"]] <- user_response_diff_plot(
  utility_predictive_data,
  "oracle",
  "utility",
  "Difference in Utility Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences[["utility"]]$plot

Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)

# Keep only the two algorithm pairings being contrasted (DFS CompassQL and
# BFS Dziban), then compute and display their utility rating difference.
utility_predictive_data_subset <- subset(
  utility_predictive_data,
  alg %in% c("DFS CompassQL", "BFS Dziban")
)
alg_differences[["utility"]] <- user_response_diff_plot(
  utility_predictive_data_subset,
  "alg",
  "utility",
  "Difference in Utility Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences[["utility"]]$plot

Differences in user score by participant group

# Utility rating differences between participant groups, via the shared
# user_response_diff_plot() helper; the returned plot is displayed below.
participant_group_differences[["utility"]] <- user_response_diff_plot(
  utility_predictive_data,
  "participant_group",
  "utility",
  "Difference in Utility Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences[["utility"]]$plot

Overall: Building a Model

# Ordinal (cumulative probit) mixed model for the overall rating: the full
# oracle x search x dataset x task interaction, a participant_group main
# effect, and a random intercept per participant. `file` names the location
# brms uses to store the fitted model.
models[["overall"]] <- brm(
  formula = bf(
    overall ~ oracle * search * dataset * task + participant_group +
      (1 | participant_id)
  ),
  data = user_response_data,
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),
  seed = seed,
  file = "models/overall"
)
## Compiling Stan program...
## Start sampling
## Warning: There were 18 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## http://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

# Print the fitted overall model's estimates together with Rhat and ESS.
summary(models[["overall"]])
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: overall ~ oracle * search * dataset * task + participant_group + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 264) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 66) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.69      0.22     1.29     2.16 1.00      846     1336
## 
## Population-Level Effects: 
##                                                          Estimate Est.Error
## Intercept[1]                                                -3.20      0.85
## Intercept[2]                                                -1.66      0.82
## Intercept[3]                                                -0.24      0.81
## Intercept[4]                                                 2.24      0.83
## oracleDziban                                                -0.81      1.04
## searchDFS                                                   -2.02      1.08
## datasetmovies                                               -0.83      1.05
## task2.RetrieveValue                                          0.01      0.63
## task3.Prediction                                             0.41      0.62
## task4.Exploration                                            1.18      0.66
## participant_groupstudent                                     0.78      0.47
## oracleDziban:searchDFS                                       2.04      1.52
## oracleDziban:datasetmovies                                   1.35      1.45
## searchDFS:datasetmovies                                      2.24      1.54
## oracleDziban:task2.RetrieveValue                             0.53      0.89
## oracleDziban:task3.Prediction                                1.69      0.90
## oracleDziban:task4.Exploration                               1.20      0.91
## searchDFS:task2.RetrieveValue                                0.90      0.87
## searchDFS:task3.Prediction                                   0.48      0.85
## searchDFS:task4.Exploration                                 -0.75      0.89
## datasetmovies:task2.RetrieveValue                            0.28      0.85
## datasetmovies:task3.Prediction                              -0.30      0.84
## datasetmovies:task4.Exploration                             -0.59      0.86
## oracleDziban:searchDFS:datasetmovies                        -2.27      2.12
## oracleDziban:searchDFS:task2.RetrieveValue                  -2.65      1.21
## oracleDziban:searchDFS:task3.Prediction                     -1.82      1.21
## oracleDziban:searchDFS:task4.Exploration                    -1.13      1.23
## oracleDziban:datasetmovies:task2.RetrieveValue              -1.31      1.22
## oracleDziban:datasetmovies:task3.Prediction                 -1.48      1.20
## oracleDziban:datasetmovies:task4.Exploration                -1.94      1.23
## searchDFS:datasetmovies:task2.RetrieveValue                 -1.50      1.18
## searchDFS:datasetmovies:task3.Prediction                    -0.59      1.15
## searchDFS:datasetmovies:task4.Exploration                    0.37      1.19
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue     4.00      1.70
## oracleDziban:searchDFS:datasetmovies:task3.Prediction        1.35      1.65
## oracleDziban:searchDFS:datasetmovies:task4.Exploration       3.14      1.71
##                                                          l-95% CI u-95% CI Rhat
## Intercept[1]                                                -4.90    -1.56 1.00
## Intercept[2]                                                -3.34    -0.05 1.00
## Intercept[3]                                                -1.90     1.32 1.00
## Intercept[4]                                                 0.58     3.89 1.00
## oracleDziban                                                -2.82     1.20 1.00
## searchDFS                                                   -4.20     0.05 1.00
## datasetmovies                                               -2.86     1.32 1.00
## task2.RetrieveValue                                         -1.22     1.24 1.00
## task3.Prediction                                            -0.78     1.64 1.00
## task4.Exploration                                           -0.07     2.53 1.00
## participant_groupstudent                                    -0.15     1.69 1.00
## oracleDziban:searchDFS                                      -0.91     4.98 1.00
## oracleDziban:datasetmovies                                  -1.58     4.23 1.00
## searchDFS:datasetmovies                                     -0.88     5.26 1.00
## oracleDziban:task2.RetrieveValue                            -1.21     2.29 1.00
## oracleDziban:task3.Prediction                               -0.14     3.40 1.00
## oracleDziban:task4.Exploration                              -0.68     2.96 1.00
## searchDFS:task2.RetrieveValue                               -0.80     2.59 1.00
## searchDFS:task3.Prediction                                  -1.19     2.15 1.00
## searchDFS:task4.Exploration                                 -2.52     0.97 1.00
## datasetmovies:task2.RetrieveValue                           -1.32     1.97 1.00
## datasetmovies:task3.Prediction                              -2.00     1.25 1.00
## datasetmovies:task4.Exploration                             -2.28     1.08 1.00
## oracleDziban:searchDFS:datasetmovies                        -6.34     2.02 1.00
## oracleDziban:searchDFS:task2.RetrieveValue                  -5.03    -0.27 1.00
## oracleDziban:searchDFS:task3.Prediction                     -4.19     0.55 1.00
## oracleDziban:searchDFS:task4.Exploration                    -3.54     1.29 1.00
## oracleDziban:datasetmovies:task2.RetrieveValue              -3.74     1.09 1.00
## oracleDziban:datasetmovies:task3.Prediction                 -3.76     0.97 1.00
## oracleDziban:datasetmovies:task4.Exploration                -4.30     0.49 1.00
## searchDFS:datasetmovies:task2.RetrieveValue                 -3.86     0.76 1.00
## searchDFS:datasetmovies:task3.Prediction                    -2.81     1.71 1.00
## searchDFS:datasetmovies:task4.Exploration                   -1.96     2.67 1.00
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue     0.73     7.29 1.00
## oracleDziban:searchDFS:datasetmovies:task3.Prediction       -2.05     4.55 1.00
## oracleDziban:searchDFS:datasetmovies:task4.Exploration      -0.26     6.49 1.00
##                                                          Bulk_ESS Tail_ESS
## Intercept[1]                                                  591     1120
## Intercept[2]                                                  574      857
## Intercept[3]                                                  570      881
## Intercept[4]                                                  613     1038
## oracleDziban                                                  552     1025
## searchDFS                                                     554     1169
## datasetmovies                                                 495      565
## task2.RetrieveValue                                           680     1386
## task3.Prediction                                              647     1328
## task4.Exploration                                             624     1503
## participant_groupstudent                                      579     1391
## oracleDziban:searchDFS                                        541     1016
## oracleDziban:datasetmovies                                    492      780
## searchDFS:datasetmovies                                       417      764
## oracleDziban:task2.RetrieveValue                              675     1706
## oracleDziban:task3.Prediction                                 671     1314
## oracleDziban:task4.Exploration                                618     1484
## searchDFS:task2.RetrieveValue                                 741     1375
## searchDFS:task3.Prediction                                    646     1553
## searchDFS:task4.Exploration                                   683     1595
## datasetmovies:task2.RetrieveValue                             694     1446
## datasetmovies:task3.Prediction                                724     1301
## datasetmovies:task4.Exploration                               701     1622
## oracleDziban:searchDFS:datasetmovies                          464      793
## oracleDziban:searchDFS:task2.RetrieveValue                    696     1663
## oracleDziban:searchDFS:task3.Prediction                       694     1221
## oracleDziban:searchDFS:task4.Exploration                      675     1488
## oracleDziban:datasetmovies:task2.RetrieveValue                780     1508
## oracleDziban:datasetmovies:task3.Prediction                   774     1457
## oracleDziban:datasetmovies:task4.Exploration                  713     1425
## searchDFS:datasetmovies:task2.RetrieveValue                   829     1546
## searchDFS:datasetmovies:task3.Prediction                      736     1496
## searchDFS:datasetmovies:task4.Exploration                     745     1485
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue      857     1773
## oracleDziban:searchDFS:datasetmovies:task3.Prediction         820     1439
## oracleDziban:searchDFS:datasetmovies:task4.Exploration        811     1533
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# plot(models$overall)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairs plot of the four ordinal threshold parameters
# (b_Intercept[1] ... b_Intercept[4]), matched by exact name.
pairs(
  models[["overall"]],
  pars = paste0("b_Intercept[", 1:4, "]"),
  fixed = TRUE
)

# Pairs plot of the main-effect coefficients. With `fixed = TRUE` the names
# are matched exactly, so they must use the same casing as the factor levels
# reported by summary() (oracleDziban, searchDFS); the previous lower-case
# spellings matched no parameter.
pairs(
  models$overall,
  pars = c(
    "b_datasetmovies",
    "b_oracleDziban",
    "b_searchDFS",
    "b_task2.RetrieveValue",
    "b_task3.Prediction",
    "b_task4.Exploration"
  ),
  fixed = TRUE
)

We now look at a response for Overall using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Build the per oracle/search rating summary from the fitted overall model
# with the shared helper, then display the plot it returns.
overall_plot <- user_response_posterior_draws_plot(
  user_response_data,
  models[["overall"]],
  NULL,
  "Oracle/Search Combination",
  "Rating"
)
overall_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

# Interval table for the overall plot: mean rating with 95% and 50% quantile
# intervals for each search/oracle combination.
overall_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating  .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS    CompassQL  0.682  0.203   1.11    0.95 mean   qi       
## 2 BFS    Dziban     0.711  0.250   1.13    0.95 mean   qi       
## 3 DFS    CompassQL  0.178 -0.368   0.662   0.95 mean   qi       
## 4 DFS    Dziban     0.539  0.0469  0.984   0.95 mean   qi       
## 5 BFS    CompassQL  0.682  0.531   0.844   0.5  mean   qi       
## 6 BFS    Dziban     0.711  0.559   0.868   0.5  mean   qi       
## 7 DFS    CompassQL  0.178  0       0.368   0.5  mean   qi       
## 8 DFS    Dziban     0.539  0.391   0.703   0.5  mean   qi
## Saving 7 x 5 in image

Overall: Differences Between Conditions

Next, we want to see if there is any significant difference in Overall rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).

# Posterior predictive draws of the overall rating for the observed rows,
# with re_formula = NA (no group-level effects), plus an `alg` label that
# pastes the search and oracle names together.
overall_predictive_data <- add_predicted_draws(
  user_response_data,
  models[["overall"]],
  seed = seed,
  re_formula = NA
)
overall_predictive_data$alg <- with(
  overall_predictive_data,
  paste(search, oracle)
)

Differences in user score by search algorithm.

# Overall rating differences between search algorithms, via the shared
# user_response_diff_plot() helper; the returned plot is displayed below.
search_differences[["overall"]] <- user_response_diff_plot(
  overall_predictive_data,
  "search",
  "overall",
  "Difference in Overall Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences[["overall"]]$plot

Differences in user score by oracle.

# Manual oracle contrast for the overall rating: average the predicted rating
# within each oracle and posterior draw, then difference the oracle levels
# with compare_levels(). weighted.mean() is given no weights here, so each
# average is unweighted.
# NOTE(review): the value stored in oracle_differences$overall by this chunk
# is replaced by the user_response_diff_plot() result later in this section.
oracle_differences[["overall"]] <-
  group_by(overall_predictive_data, oracle, .draw) %>%
  summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
  compare_levels(rating, by = oracle) %>%
  rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$overall$metric <- "overall"

# Half-eye density of the difference with 95% and 50% intervals; the dashed
# vertical line marks zero difference.
ggplot(oracle_differences$overall, aes(x = diff_in_rating, y = "overall")) +
  labs(
    x = paste0(
      "Expected Difference in Rating (",
      oracle_differences$overall[1, "oracle"],
      ")"
    ),
    y = "Condition"
  ) +
  stat_halfeye(.width = c(.95, .5)) +
  geom_vline(xintercept = 0, linetype = "longdash") +
  theme_minimal()

# Overall rating differences between oracles, via the shared
# user_response_diff_plot() helper; the returned plot is displayed below.
oracle_differences[["overall"]] <- user_response_diff_plot(
  overall_predictive_data,
  "oracle",
  "overall",
  "Difference in Overall Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences[["overall"]]$plot

Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)

# Keep only the two algorithm pairings being contrasted (DFS CompassQL and
# BFS Dziban), then compute and display their overall rating difference.
overall_predictive_data_subset <- subset(
  overall_predictive_data,
  alg %in% c("DFS CompassQL", "BFS Dziban")
)
alg_differences[["overall"]] <- user_response_diff_plot(
  overall_predictive_data_subset,
  "alg",
  "overall",
  "Difference in Overall Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences[["overall"]]$plot

Differences in user score by participant group

# Overall rating differences between participant groups, via the shared
# user_response_diff_plot() helper; the returned plot is displayed below.
participant_group_differences[["overall"]] <- user_response_diff_plot(
  overall_predictive_data,
  "participant_group",
  "overall",
  "Difference in Overall Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences[["overall"]]$plot

Summary Plots

Putting all of the plots for search algorithm and oracle differences together, split by whether the rating metric is of type confidence or preference. We’ll start with differences in search algorithms.

Differences in Search Algorithms

# Stack the search-algorithm differences of all six rating metrics into one
# table (same row order as listing them one by one), summarise them with
# user_response_diff_summary(), and display the confidence-metric plot.
combined_search_differences <- do.call(
  rbind,
  lapply(
    c(
      "confidence_udata", "confidence_ans", "efficiency",
      "ease_of_use", "utility", "overall"
    ),
    function(metric) search_differences[[metric]]$differences
  )
)
search_difference_plots_intervals <- user_response_diff_summary(
  combined_search_differences,
  "search"
)
search_difference_plots_intervals$plot_confidence

View intervals

# Interval table for the confidence metrics of the search-algorithm contrast.
search_difference_plots_intervals$intervals_confidence
## # A tibble: 4 x 8
## # Groups:   search [1]
##   search    metric             difference .lower .upper .width .point .interval
##   <chr>     <fct>                   <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS - DFS Answer                -0.0752 -0.606 0.424    0.95 mean   qi       
## 2 BFS - DFS Understanding Data     0.0484 -0.485 0.636    0.95 mean   qi       
## 3 BFS - DFS Answer                -0.0752 -0.242 0.0909   0.5  mean   qi       
## 4 BFS - DFS Understanding Data     0.0484 -0.152 0.242    0.5  mean   qi
# Display the preference-metric plot of the search-algorithm contrast.
search_difference_plots_intervals$plot_preference

View intervals

# Interval table for the preference metrics of the search-algorithm contrast.
search_difference_plots_intervals$intervals_preference
## # A tibble: 8 x 8
## # Groups:   search [1]
##   search    metric      difference  .lower .upper .width .point .interval
##   <chr>     <fct>            <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS - DFS Overall          0.344 -0.273   0.970   0.95 mean   qi       
## 2 BFS - DFS Utility          0.517 -0.364   1.33    0.95 mean   qi       
## 3 BFS - DFS Ease of Use      0.588 -0.0303  1.21    0.95 mean   qi       
## 4 BFS - DFS Efficiency       0.641 -0.121   1.39    0.95 mean   qi       
## 5 BFS - DFS Overall          0.344  0.121   0.545   0.5  mean   qi       
## 6 BFS - DFS Utility          0.517  0.242   0.818   0.5  mean   qi       
## 7 BFS - DFS Ease of Use      0.588  0.364   0.788   0.5  mean   qi       
## 8 BFS - DFS Efficiency       0.641  0.364   0.909   0.5  mean   qi

Differences in Oracle

# Stack the oracle differences of all six rating metrics into one table
# (same row order as listing them one by one), summarise them with
# user_response_diff_summary(), and display the confidence-metric plot.
combined_oracle_differences <- do.call(
  rbind,
  lapply(
    c(
      "confidence_udata", "confidence_ans", "efficiency",
      "ease_of_use", "utility", "overall"
    ),
    function(metric) oracle_differences[[metric]]$differences
  )
)
oracle_difference_plots_intervals <- user_response_diff_summary(
  combined_oracle_differences,
  "oracle"
)
oracle_difference_plots_intervals$plot_confidence

View intervals

# Print the oracle-difference interval summary for the confidence metrics.
oracle_difference_plots_intervals[["intervals_confidence"]]
## # A tibble: 4 x 8
## # Groups:   oracle [1]
##   oracle         metric        difference  .lower .upper .width .point .interval
##   <chr>          <fct>              <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 Dziban - Comp… Answer             0.144 -0.394   0.727   0.95 mean   qi       
## 2 Dziban - Comp… Understandin…      0.209 -0.273   0.697   0.95 mean   qi       
## 3 Dziban - Comp… Answer             0.144 -0.0606  0.333   0.5  mean   qi       
## 4 Dziban - Comp… Understandin…      0.209  0.0606  0.364   0.5  mean   qi
# Display the `plot_preference` element of the oracle-difference summary.
oracle_difference_plots_intervals[["plot_preference"]]

View intervals

# Print the oracle-difference interval summary for the preference metrics.
oracle_difference_plots_intervals[["intervals_preference"]]
## # A tibble: 8 x 8
## # Groups:   oracle [1]
##   oracle            metric     difference  .lower .upper .width .point .interval
##   <chr>             <fct>           <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 Dziban - Compass… Overall        0.205  -0.515   0.879   0.95 mean   qi       
## 2 Dziban - Compass… Utility        0.290  -0.515   1.09    0.95 mean   qi       
## 3 Dziban - Compass… Ease of U…     0.0684 -0.606   0.727   0.95 mean   qi       
## 4 Dziban - Compass… Efficiency     0.242  -0.545   1.03    0.95 mean   qi       
## 5 Dziban - Compass… Overall        0.205  -0.0303  0.455   0.5  mean   qi       
## 6 Dziban - Compass… Utility        0.290   0.0303  0.545   0.5  mean   qi       
## 7 Dziban - Compass… Ease of U…     0.0684 -0.152   0.303   0.5  mean   qi       
## 8 Dziban - Compass… Efficiency     0.242  -0.0303  0.515   0.5  mean   qi

DFS CompassQL vs BFS Dziban (differences are computed as BFS Dziban - DFS CompassQL)

# Row-bind the per-metric algorithm differences (same metric order as listed:
# confidence metrics first, then preference metrics) into one data frame.
combined_alg_differences <- do.call(
  rbind,
  lapply(
    c(
      "confidence_udata", "confidence_ans", "efficiency",
      "ease_of_use", "utility", "overall"
    ),
    function(metric_key) alg_differences[[metric_key]]$differences
  )
)

# Build the plots and interval tables for the algorithm comparison, then show
# the confidence plot.
alg_difference_plots_intervals <- user_response_diff_summary(
  combined_alg_differences,
  "alg"
)
alg_difference_plots_intervals[["plot_confidence"]]

View intervals

# Print the algorithm-difference interval summary for the confidence metrics.
alg_difference_plots_intervals[["intervals_confidence"]]
## # A tibble: 4 x 8
## # Groups:   alg [1]
##   alg              metric       difference .lower .upper .width .point .interval
##   <chr>            <fct>             <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS Dziban - DF… Answer           0.0663 -0.647  0.765   0.95 mean   qi       
## 2 BFS Dziban - DF… Understandi…     0.250  -0.471  1.06    0.95 mean   qi       
## 3 BFS Dziban - DF… Answer           0.0663 -0.176  0.294   0.5  mean   qi       
## 4 BFS Dziban - DF… Understandi…     0.250   0      0.529   0.5  mean   qi
# Display the `plot_preference` element of the algorithm-difference summary.
alg_difference_plots_intervals[["plot_preference"]]

View intervals

# Print the algorithm-difference interval summary for the preference metrics.
alg_difference_plots_intervals[["intervals_preference"]]
## # A tibble: 8 x 8
## # Groups:   alg [1]
##   alg                 metric    difference .lower .upper .width .point .interval
##   <chr>               <fct>          <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS Dziban - DFS C… Overall        0.533 -0.412  1.47    0.95 mean   qi       
## 2 BFS Dziban - DFS C… Utility        0.783 -0.294  1.82    0.95 mean   qi       
## 3 BFS Dziban - DFS C… Ease of …      0.637 -0.294  1.53    0.95 mean   qi       
## 4 BFS Dziban - DFS C… Efficien…      0.857 -0.235  1.94    0.95 mean   qi       
## 5 BFS Dziban - DFS C… Overall        0.533  0.235  0.882   0.5  mean   qi       
## 6 BFS Dziban - DFS C… Utility        0.783  0.412  1.18    0.5  mean   qi       
## 7 BFS Dziban - DFS C… Ease of …      0.637  0.353  0.941   0.5  mean   qi       
## 8 BFS Dziban - DFS C… Efficien…      0.857  0.471  1.24    0.5  mean   qi

Differences in Participant Group

# Row-bind the per-metric participant-group differences (same metric order as
# listed: confidence metrics first, then preference metrics) into one data
# frame.
combined_participant_group_differences <- do.call(
  rbind,
  lapply(
    c(
      "confidence_udata", "confidence_ans", "efficiency",
      "ease_of_use", "utility", "overall"
    ),
    function(metric_key) {
      participant_group_differences[[metric_key]]$differences
    }
  )
)

# Build the plots and interval tables for the participant-group comparison,
# then show the confidence plot.
participant_group_difference_plots_intervals <- user_response_diff_summary(
  combined_participant_group_differences,
  "participant_group"
)
participant_group_difference_plots_intervals[["plot_confidence"]]

View intervals

# Print the participant-group interval summary for the confidence metrics.
participant_group_difference_plots_intervals[["intervals_confidence"]]
## # A tibble: 4 x 8
## # Groups:   participant_group [1]
##   participant_group  metric    difference  .lower .upper .width .point .interval
##   <chr>              <fct>          <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 student - profess… Answer         0.134 -0.275   0.579   0.95 mean   qi       
## 2 student - profess… Understa…      0.216 -0.183   0.638   0.95 mean   qi       
## 3 student - profess… Answer         0.134 -0.0115  0.275   0.5  mean   qi       
## 4 student - profess… Understa…      0.216  0.0769  0.352   0.5  mean   qi
# Display the `plot_preference` element of the participant-group summary.
participant_group_difference_plots_intervals[["plot_preference"]]

View intervals

# Print the participant-group interval summary for the preference metrics.
participant_group_difference_plots_intervals[["intervals_preference"]]
## # A tibble: 8 x 8
## # Groups:   participant_group [1]
##   participant_group   metric   difference  .lower .upper .width .point .interval
##   <chr>               <fct>         <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 student - professi… Overall       0.413 -0.152   0.987   0.95 mean   qi       
## 2 student - professi… Utility       0.255 -0.412   0.913   0.95 mean   qi       
## 3 student - professi… Ease of…      0.344 -0.198   0.902   0.95 mean   qi       
## 4 student - professi… Efficie…      0.240 -0.412   0.896   0.95 mean   qi       
## 5 student - professi… Overall       0.413  0.217   0.606   0.5  mean   qi       
## 6 student - professi… Utility       0.255  0.0264  0.487   0.5  mean   qi       
## 7 student - professi… Ease of…      0.344  0.156   0.531   0.5  mean   qi       
## 8 student - professi… Efficie…      0.240  0.0115  0.467   0.5  mean   qi